{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/home/soda/rcappuzz/work/benchmark-join-suggestions\n"
     ]
    }
   ],
   "source": [
    "cd ~/bench"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import polars as pl\n",
    "from pathlib import Path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "raw_data_path = Path(\"data/source_tables/raw\")\n",
    "yadl_data_path = Path(\"data/source_tables/yadl\")\n",
    "od_data_path = Path(\"data/source_tables/open_data_us\")\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Company employees"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_parquet(Path(raw_data_path, \"company-employees-yadl.parquet\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col(\"target\"), pl.col(\"col_to_embed\")).write_parquet(\n",
    "    Path(yadl_data_path, \"company-employees-depleted-yadl.parquet\")\n",
    ")\n",
    "df.select(pl.col(\"name\"), pl.col(\"target\")).write_parquet(\n",
    "    Path(od_data_path, \"company-employees-depleted_name-open_data.parquet\")\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# US Accidents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_parquet(Path(raw_data_path, \"us-accidents-yadl.parquet\"))\n",
    "df.drop(\"col_to_embed\").write_parquet(Path(od_data_path, \"us_accidents-open_data.parquet\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col(\"target\"), pl.col(\"col_to_embed\")).write_parquet(\n",
    "    Path(yadl_data_path, \"us_accidents-depleted-yadl.parquet\")\n",
    ")\n",
    "df.select(pl.col(\"County\"), pl.col(\"target\")).write_parquet(\n",
    "    Path(od_data_path, \"us_accidents-depleted_County-open_data.parquet\")\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# US Elections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_parquet(Path(yadl_data_path, \"us_elections-yadl.parquet\"))\n",
    "df.drop(\"col_to_embed\").write_parquet(Path(od_data_path, \"us_elections-open_data.parquet\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col(\"target\"), pl.col(\"party\"), pl.col(\"col_to_embed\")).write_parquet(\n",
    "    Path(yadl_data_path, \"us_elections-yadl-depleted.parquet\")\n",
    ")\n",
    "df.select(pl.col(\"county_name\"), pl.col(\"party\"), pl.col(\"target\")).write_parquet(\n",
    "    Path(od_data_path, \"us_elections-depleted_county_name-open_data.parquet\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Prepare versions of US elections filtered by party"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_filtered = df.filter(pl.col(\"party\") == \"DEMOCRAT\")\n",
    "df_filtered.select(\n",
    "    pl.col(\"state\"),\n",
    "    pl.col(\"county_name\"),\n",
    "    pl.col(\"party\"),\n",
    "    pl.col(\"target\"),\n",
    "    pl.col(\"col_to_embed\"),\n",
    ").write_parquet(Path(yadl_data_path, \"us_elections_dems.parquet\"))\n",
    "\n",
    "df_filtered.select(pl.col(\"col_to_embed\"),pl.col(\"target\")).write_parquet(\n",
    "    Path(yadl_data_path, \"us_elections_dems-depleted-yadl.parquet\")\n",
    ")\n",
    "df_filtered.select(pl.col(\"county_name\"), pl.col(\"target\")).write_parquet(\n",
    "    Path(od_data_path, \"us_elections_dems-depleted-open_data.parquet\")\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Movies revenue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_parquet(Path(raw_data_path, \"movies-yadl.parquet\"))\n",
    "df.drop(\"col_to_embed\").write_parquet(Path(od_data_path, \"movies-open_data.parquet\"))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col(\"target\"), pl.col(\"col_to_embed\")).write_parquet(\n",
    "    Path(yadl_data_path, \"movies-yadl-depleted.parquet\")\n",
    ")\n",
    "# df.select(pl.col(\"title\"), pl.col(\"target\")).write_parquet(\n",
    "#     Path(od_data_path, \"movies-depleted_title-open_data.parquet\")\n",
    "# )"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Movies votes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_parquet(Path(raw_data_path, \"movies-vote-yadl.parquet\"))\n",
    "df.drop(\"col_to_embed\").write_parquet(Path(od_data_path, \"movies_vote-open_data.parquet\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col(\"target\"), pl.col(\"col_to_embed\")).write_parquet(\n",
    "    Path(yadl_data_path, \"movies_vote-depleted-yadl.parquet\")\n",
    ")\n",
    "df.select(pl.col(\"title\"), pl.col(\"target\")).write_parquet(\n",
    "    Path(od_data_path, \"movies_vote-depleted_title-open_data.parquet\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Housing Prices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_parquet(Path(yadl_data_path, \"housing_prices-yadl.parquet\")).drop(\"col_to_embed\")\n",
    "df.drop(\"col_to_embed\").write_parquet(Path(od_data_path, \"housing_prices-open_data.parquet\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col(\"target\"), pl.col(\"County\")).write_parquet(\n",
    "    Path(od_data_path, \"housing_prices-depleted_County-open_data.parquet\")\n",
    ")\n",
    "df.select(pl.col(\"target\"), pl.col(\"Metro\")).write_parquet(\n",
    "    Path(od_data_path, \"housing_prices-depleted_Metro-open_data.parquet\")\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Metam Housing Prices"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_csv(Path(raw_data_path, \"housing.csv\"), infer_schema_length=10000)\n",
    "df = df.rename({\"URL (SEE https://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)\": \"URL\", \"ZIP OR POSTAL CODE\": \"ZIP\"})\n",
    "df = df.with_columns(pl.col(\"PRICE\").log10().alias(\"target\")).drop(\"PRICE\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col(\"target\"), pl.col(\"ZIP\")).write_parquet(\n",
    "    Path(od_data_path, \"housing-depleted_ZIP-open_data.parquet\")\n",
    ")\n",
    "df.select(pl.col(\"target\"), pl.col(\"CITY\")).write_parquet(\n",
    "    Path(od_data_path, \"housing-depleted_CITY-open_data.parquet\")\n",
    ")\n",
    "df.write_parquet(\n",
    "    Path(od_data_path, \"housing-open_data.parquet\")\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Metam Schools"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pl.read_csv(Path(od_data_path, \"schools.csv\"), infer_schema_length=10000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.with_columns(pl.col(\"class\").cast(pl.Int8).alias(\"target\")).drop(\"class\")\n",
    "df = df.rename({\"DBN\": \"col_to_embed\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.select(pl.col(\"target\"), pl.col(\"col_to_embed\")).write_parquet(\n",
    "    Path(od_data_path, \"schools-depleted-open_data.parquet\")\n",
    ")\n",
    "df.write_parquet(\n",
    "    Path(od_data_path, \"schools-open_data.parquet\")\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "bench",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
